#ifdef ENABLE_ARM32

.text
.align 5
.global ConvDwFp32Border
#ifndef __APPLE__
.type ConvDwFp32Border, %function
#endif

// void ConvDwFp32Border(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width,
//                       size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu, size_t relu6)
//
// Computes one 4-float output vector:
//   acc = bias[0..3]
//   for h in [0, height): for w in [0, width):
//       acc += src_vec(h, w) * weight_vec(h, w)      (4 lanes at a time)
//   if relu6: acc = max(min(acc, 6), 0)   else if relu: acc = max(acc, 0)
//   dst[0..3] = acc
//
// in_kh_step, in_kw_step and kernel_w are added directly to the pointers, i.e. they are BYTE offsets:
//   in_kw_step : distance between consecutive src vectors within a row
//   in_kh_step : distance between consecutive src rows
//   kernel_w   : distance between consecutive weight rows (weight vectors within a row are contiguous, 16 bytes each)
// If height == 0 or width == 0 nothing is accumulated and dst receives the (activated) bias.
//
// Register map after the prologue:
//   r0: dst, r1: src row, r2: weight row, r3: bias, r4: height, r5: width, r6: in_kh_step, r7: in_kw_step,
//   r8: kernel_w, r9: relu, r10: relu6, r11: src cursor, r12: weight cursor, r14: inner loop counter
//   q0: accumulator, q1: 6.0f x4, q2: 0.0f x4, q3: src vector, q4: weight vector
ConvDwFp32Border:
    // AAPCS32: r4-r11 and d8-d15 (q4-q7) are callee-saved
    // (https://static.docs.arm.com/ihi0042/i/aapcs32.pdf). lr is saved so it can be used as r14 scratch.
    push {r4-r12, lr}   // 10 registers * 4 bytes = 40 bytes
    vpush {q4-q7}       //  4 registers * 16 bytes = 64 bytes

    // The stack-passed arguments start right above the 104-byte (40 + 64) save area.
    // sp is deliberately left pointing at the bottom of the save area: memory below sp may be
    // overwritten asynchronously (e.g. by a signal handler), which would corrupt the saved registers.
    ldr r4, [sp, #104]   // height
    ldr r5, [sp, #108]   // width
    ldr r6, [sp, #112]   // in_kh_step
    ldr r7, [sp, #116]   // in_kw_step
    ldr r8, [sp, #120]   // kernel_w
    ldr r9, [sp, #124]   // relu
    ldr r10, [sp, #128]  // relu6

    vld1.32 {q0}, [r3] // bias
    vmov.i32 q1, #6    // relu6
    vcvt.f32.s32 q1, q1
    veor q2, q2, q2  // relu

    // Both loops below are do-while shaped (decrement, then branch if non-zero), so a zero
    // trip count would wrap around to 2^32 iterations. Skip the accumulation in that case.
    cmp r4, #0
    beq Activation
    cmp r5, #0
    beq Activation

    LoopH:
        mov r11, r1     // src cursor for this row
        mov r12, r2     // weight cursor for this row
        mov r14, r5     // remaining columns in this row
        LoopW:
            vld1.32 {q3}, [r11], r7     // load src vector, then step by in_kw_step
            vld1.32 {q4}, [r12]!        // load weight vector, then step by 16 bytes
            vmla.f32 q0, q3, q4
            subs r14, r14, #1
            bne LoopW
        subs r4, r4, #1     // sets the flags consumed by "bne LoopH"
        add r1, r1, r6      // next src row (plain add: flags stay untouched)
        add r2, r2, r8      // next weight row
        bne LoopH

    Activation:
    cmp r10, #0
    bne Relu6
    cmp r9, #0
    bne Relu
    b Write
    Relu6:
        vmin.f32 q0, q0, q1     // upper clamp at 6, then fall through to the lower clamp
    Relu:
        vmax.f32 q0, q0, q2     // lower clamp at 0
    Write:
        vst1.32 {q0}, [r0]

    vpop {q4-q7}
    pop {r4-r12, pc}    // restoring the saved lr into pc returns to the caller
#endif
